# author: j.rehmert@hphd.hertie-school.org
# project: Coalition Probabilities
# content: function to obtain top three predicted coalitions and descriptives



getTopThree <- function(dat = "dat", group.var = "cabinet_id", preds.var = "expected", true.coal = "realg"){
  ##
  ## Ranks the candidate coalitions of every formation opportunity by their
  ## predicted probability and returns descriptives on where the observed
  ## coalition ends up in that ranking (ranks 1 to 5 are tabulated).
  ##
  ## Args.:
  ##   dat:       A data frame with one row per candidate coalition. Besides the
  ##              columns named by the arguments below it must contain
  ##              "cab_comp" (reported in `party_composition`). The columns
  ##              "numpar" and "similar" are used when present; without them
  ##              the corresponding results are empty.
  ##   group.var: Name of the grouping column; default is "cabinet_id".
  ##   preds.var: Name of the column with predicted probabilities; default is
  ##              "expected".
  ##   true.coal: Name of the 0/1 column marking the observed coalition;
  ##              default is "realg".
  ##
  ## Returns:
  ##   A named list:
  ##     true_<k> / false_<k>:   number of observed / not observed coalitions at
  ##                             rank k (k = first ... fifth); 0 if rank k never
  ##                             occurs. The element for rank 4 is spelled
  ##                             "true_forth"; the name is kept for callers.
  ##     distribution_true_coal: rank of every observed coalition
  ##     probability_true_coal:  predicted probability of every observed coalition
  ##     num_party_diff*:        numpar of the observed coalition minus numpar of
  ##                             the top-ranked coalition of the same group
  ##     no_of_formopp:          number of distinct groups
  ##     party_composition:      data frame, one row per group, with cab_comp of
  ##                             the top-ranked coalition and of the observed
  ##                             coalition in the column of its rank (NA elsewhere)
  ##     prob_dist_2/3, similar_dist_2/3: absolute gap in predicted probability /
  ##                             in "similar" between the top-ranked coalition and
  ##                             an observed coalition ranked 2nd / 3rd
  ##
  ## Rows whose indicator is NA are ignored when selecting observed coalitions.
  ##

  if (!is.data.frame(dat)) {
    stop("`dat` must be a data frame", call. = FALSE)
  }
  # work on a plain data.frame so that `[`, `[[` and `$` behave uniformly
  dat <- as.data.frame(dat)
  missing.cols <- setdiff(c(group.var, preds.var, true.coal, "cab_comp"), names(dat))
  if (length(missing.cols) > 0) {
    stop("missing column(s) in `dat`: ", paste(missing.cols, collapse = ", "), call. = FALSE)
  }

  dat$expected <- dat[[preds.var]]
  dat$true <- dat[[true.coal]]
  # order by group and, within group, by decreasing predicted probability
  dat <- dat[order(dat[[group.var]], -dat[[preds.var]]), ]
  # within-group rank of the predicted probability (1 = most likely);
  # ties are broken by row order so every rank occurs at most once per group
  dat$rank <- ave(dat[[preds.var]], dat[[group.var]],
                  FUN = function(x) rank(-x, ties.method = "first"))

  groups <- dat[[group.var]]
  top.idx <- which(dat$rank == 1)   # top-ranked (predicted) coalition per group
  true.idx <- which(dat$true == 1)  # observed coalitions

  # rows of observed coalitions that were ranked k-th
  true.at <- function(k) which(dat$true == 1 & dat$rank == k)
  # difference between the given rows and the rank-1 row of their own group;
  # the rank-1 row is looked up by group rather than by row offset
  gap.to.top <- function(values, idx) {
    values[idx] - values[top.idx[match(groups[idx], groups[top.idx])]]
  }
  # number of rank-k rows whose indicator equals `value`
  count.at <- function(k, value) sum(dat$rank == k & dat$true == value, na.rm = TRUE)

  dist.of.true <- dat$rank[true.idx]
  prob.of.true <- dat$expected[true.idx]
  n <- length(unique(groups))

  # number of parties: observed coalition vs. predicted coalition of its group
  numpar.diff <- gap.to.top(dat$numpar, true.idx)
  numpar.diff.2 <- gap.to.top(dat$numpar, true.at(2))
  numpar.diff.3 <- gap.to.top(dat$numpar, true.at(3))

  # gap in predicted probability between observed 2nd/3rd rank and wrongly predicted 1st rank
  prob.dist.2 <- abs(gap.to.top(dat$expected, true.at(2)))
  prob.dist.3 <- abs(gap.to.top(dat$expected, true.at(3)))

  # same gap for the similarity score
  similar.dist.2 <- abs(gap.to.top(dat$similar, true.at(2)))
  similar.dist.3 <- abs(gap.to.top(dat$similar, true.at(3)))

  # party combinations: predicted coalition per group plus the observed
  # coalition in the column of the rank it received
  pty <- data.frame(groups[top.idx], dat$cab_comp[top.idx], stringsAsFactors = FALSE)
  names(pty) <- c(group.var, "predicted_coalition")
  rank.cols <- c("true_1st_rank", "true_2nd_rank", "true_3rd_rank",
                 "true_4th_rank", "true_5th_rank")
  for (k in seq_along(rank.cols)) {
    hit <- true.at(k)
    pty[[rank.cols[k]]] <- dat$cab_comp[hit][match(pty[[group.var]], groups[hit])]
  }

  list(true_first = count.at(1, 1), false_first = count.at(1, 0),
       true_second = count.at(2, 1), false_second = count.at(2, 0),
       true_third = count.at(3, 1), false_third = count.at(3, 0),
       true_forth = count.at(4, 1), false_fourth = count.at(4, 0),
       true_fifth = count.at(5, 1), false_fifth = count.at(5, 0),
       distribution_true_coal = dist.of.true,
       probability_true_coal = prob.of.true,
       num_party_diff = numpar.diff, # difference in number of parties between true and predicted coalition
       num_party_diff_2nk = numpar.diff.2,  # difference in number of parties between true (2nd ranked) and predicted coalition
       num_party_diff_3rd = numpar.diff.3,  # difference in number of parties between true (3rd ranked) and predicted coalition
       no_of_formopp = n,
       party_composition = pty,
       prob_dist_2 = prob.dist.2, # distance in predicted probability between predicted coalition and true (2nd ranked) coalition
       prob_dist_3 = prob.dist.3, # distance in predicted probability between predicted coalition and true (3rd ranked) coalition
       similar_dist_2 = similar.dist.2, # distance in similarity score between predicted coalition and true (2nd ranked) coalition
       similar_dist_3 = similar.dist.3) # distance in similarity score between predicted coalition and true (3rd ranked) coalition

}